"""Submit KCIDB data to DataWarehouse."""
import argparse
from collections import abc
from importlib import resources
import itertools
import json
import os
import pathlib
import typing

from cki_lib import gitlab
from cki_lib import messagequeue
from cki_lib import metrics
from cki_lib import misc
from cki_lib import yaml
from cki_lib.kcidb import validate_extended_kcidb_schema
from cki_lib.logger import get_logger
from cki_lib.session import get_session
import datawarehouse
from gitlab.exceptions import GitlabGetError
import prometheus_client
import requests

# Shared logger and HTTP session for all submitter code paths; the generous
# 600s timeout presumably accommodates large artifact downloads — TODO confirm.
LOGGER = get_logger('cki.datawarehouse_submitter')
SESSION = get_session('cki.datawarehouse_submitter', timeout=600)


# Prometheus metrics, labeled by where the KCIDB data came from ('gitlab',
# 'umb', 'file') or by the processing outcome of a consumed message.
METRIC_RESULTS_COUNT = prometheus_client.Counter(
    'cki_datawarehouse_results_count', 'Number of results submitted to DataWarehouse',
    ['source'])
METRIC_RESULTS_SIZE = prometheus_client.Histogram(
    'cki_datawarehouse_results_size', 'Size of results submitted to DataWarehouse',
    ['source'])
METRIC_RESULTS_PROCESSED = prometheus_client.Counter(
    'cki_datawarehouse_results_processed', 'Number of messages processed by the submitter',
    ['status'])


def batched(iterable: abc.Iterable[typing.Any], n: int) -> abc.Iterable[tuple[typing.Any]]:
    """Batch data from the iterable into tuples of length n. The last batch may be shorter.

    Backport of Python 3.12.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    iterator = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(iterator, n))
        if not chunk:
            break
        yield chunk


class Ignore(Exception):
    """Signal that the message triggering this error should be dropped without logging a traceback."""


class Log(Exception):
    """Signal that the message triggering this error should be discarded after logging it."""


class Submitter:
    """Submit KCIDB data to DataWarehouse."""

    def __init__(
        self,
        *,
        batch_size: int | None = None,
        web_url: list[str] | None = None,
    ) -> None:
        """Submit KCIDB data to DataWarehouse.

        Args:
            batch_size: maximum number of KCIDB objects per submission
                request; defaults to 100.
            web_url: optional allow list of URL prefixes; 'herder' messages
                whose job URL matches none of them are ignored.

        Requires the DATAWAREHOUSE_URL and DATAWAREHOUSE_TOKEN_SUBMITTER
        environment variables to be set.
        """
        self.dw_api = datawarehouse.Datawarehouse(os.environ['DATAWAREHOUSE_URL'],
                                                  os.environ['DATAWAREHOUSE_TOKEN_SUBMITTER'],
                                                  session=SESSION)
        self.batch_size = batch_size or 100
        self.web_url = web_url or []

    def batch_payload(
        self,
        data: dict[str, typing.Any],
        source: str,
    ) -> abc.Iterable[dict[str, typing.Any]]:
        """Yield the KCIDB payload batching the items if they exceed the size threshold."""
        # setdefault both reads and mutates the caller's dict: missing object
        # lists are created as empty lists.
        checkouts = data.setdefault("checkouts", [])
        builds = data.setdefault("builds", [])
        tests = data.setdefault("tests", [])

        # Size is the total object count across all three object types.
        data_size = len(tests) + len(builds) + len(checkouts)
        METRIC_RESULTS_SIZE.labels(source).observe(data_size)

        # Should not submit payload with empty lists
        if data_size == 0:
            return

        # If the data is small enough, we don't need to split it
        if data_size < self.batch_size:
            yield data
            return

        # Split per object type; each yielded payload carries only "version"
        # plus a single object list, so any other top-level keys of `data`
        # are dropped on this path.
        for object_type in ('checkouts', 'builds', 'tests'):
            for batch in batched(data[object_type], self.batch_size):
                yield {
                    "version": data["version"],
                    object_type: batch,
                }

    def callback(self, **kwargs: typing.Any) -> None:
        """Submit KCIDB data to DataWarehouse.

        Maps the outcome of process() onto the 'status' label of
        METRIC_RESULTS_PROCESSED: Ignore -> ignored (info log only),
        Log -> logged (traceback, message discarded), any other exception ->
        retried (re-raised so the message queue redelivers), success ->
        submitted.
        """
        try:
            self.process(**kwargs)
        except Ignore as exc:
            METRIC_RESULTS_PROCESSED.labels('ignored').inc()
            LOGGER.info('%s', exc.args[0])
        except Log:
            METRIC_RESULTS_PROCESSED.labels('logged').inc()
            LOGGER.exception('Discarding message!')
        except Exception:
            # Re-raised on purpose so the message is redelivered/retried.
            METRIC_RESULTS_PROCESSED.labels('retried').inc()
            raise
        else:
            METRIC_RESULTS_PROCESSED.labels('submitted').inc()

    def process(
        self,
        body: typing.Any = None,
        headers: dict[str, str] | None = None,
        only_production_or_staging: bool = True,
        **_: typing.Any,
    ) -> None:
        """Submit KCIDB data to DataWarehouse.

        Dispatches on the 'message-type' header:
        - 'herder': body carries a GitLab job web_url; the KCIDB data is read
          from the job's kcidb_all.json artifact.
        - 'amqp-bridge' / 'file': body already is the KCIDB data.

        Raises Ignore or Log to tell callback() how to account for failures;
        any other exception propagates for retry.
        """
        match message_type := (headers or {}).get('message-type'):
            case 'herder':
                # Enforce the optional allow list of job URL prefixes.
                if self.web_url and not any(body['web_url'].startswith(w) for w in self.web_url):
                    raise Ignore(f'Job {body["web_url"]} not in allow list')
                LOGGER.info('Processing data from %s', body['web_url'])
                source = 'gitlab'
                gl_instance, gl_job = gitlab.parse_gitlab_url(body['web_url'])
                with gl_instance:
                    try:
                        kcidb_all_content = gl_job.artifact('kcidb_all.json')
                    except (GitlabGetError, requests.exceptions.ChunkedEncodingError) as exc:
                        # Missing/unreadable artifact is expected for some jobs.
                        raise Ignore(f'Job {gl_job.name} has no kcidb_all.json file') from exc

                if not kcidb_all_content:
                    raise Ignore(f'Job {gl_job.name} has an empty kcidb_all.json file')

                try:
                    kcidb_data = json.loads(kcidb_all_content)
                except json.JSONDecodeError as exc:
                    # Corrupt JSON is worth a logged traceback, not a retry.
                    raise Log(f'Job {gl_job.name} has an invalid kcidb_all.json file') from exc

            case 'amqp-bridge':
                LOGGER.info('Processing UMB data')
                source = 'umb'
                kcidb_data = body

            case 'file':
                LOGGER.info('Processing file data')
                source = 'file'
                kcidb_data = body

            case _:
                raise Log(f'Unknown message type {message_type}')

        try:
            validate_extended_kcidb_schema(kcidb_data)
        except Exception as exc:
            raise Log('Unable to validate with KCIDB schema') from exc

        # Dry-run guard: outside production/staging, stop before submitting
        # unless the caller explicitly opted out (e.g. the CLI entry points).
        if not misc.is_production_or_staging() and only_production_or_staging:
            raise Ignore('Would submit if in production or staging')

        for batched_data in self.batch_payload(kcidb_data, source):
            self.dw_api.kcidb.submit.create(data=batched_data)

        METRIC_RESULTS_COUNT.labels(source).inc()


def main(args: list[str] | None = None) -> None:
    """Submit results to DataWarehouse."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--job-url', help='Upload data from a single GitLab job')
    parser.add_argument('--kcidb-file', help='Upload data from KCIDB file')
    options = parser.parse_args(args)

    # Optional YAML configuration, validated against the bundled schema.
    settings = yaml.load(
        schema_path=resources.files(__package__) / 'schema.yml',
        contents=os.environ.get('DATAWAREHOUSE_SUBMITTER_CONFIG'),
        file_path=os.environ.get('DATAWAREHOUSE_SUBMITTER_CONFIG_PATH'),
    ) or {}

    submitter = Submitter(
        batch_size=settings.get('batch_size'),
        web_url=settings.get('web_url'),
    )

    # One-shot mode: submit data for a single GitLab job.
    if options.job_url:
        submitter.callback(
            headers={'message-type': 'herder'},
            body={'web_url': options.job_url},
            only_production_or_staging=False)
        return

    # One-shot mode: submit the contents of a local KCIDB file.
    if options.kcidb_file:
        submitter.callback(
            headers={'message-type': 'file'},
            body=json.loads(pathlib.Path(options.kcidb_file).read_text('utf8')),
            only_production_or_staging=False)
        return

    # Daemon mode: pre-create all label combinations so the metrics are
    # exported from the start, then consume messages forever.
    for metric_source in ('gitlab', 'umb'):
        METRIC_RESULTS_COUNT.labels(metric_source)
        METRIC_RESULTS_SIZE.labels(metric_source)
    for outcome in ('ignored', 'logged', 'retried', 'submitted'):
        METRIC_RESULTS_PROCESSED.labels(outcome)
    metrics.prometheus_init()

    messagequeue.MessageQueue().consume_messages(
        os.environ.get('WEBHOOK_RECEIVER_EXCHANGE', 'cki.exchange.webhooks'),
        os.environ['DATAWAREHOUSE_SUBMITTER_ROUTING_KEYS'].split(),
        submitter.callback,
        queue_name=os.environ.get('DATAWAREHOUSE_SUBMITTER_QUEUE'),
    )
